a = read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-10-27/wind-turbine.csv")##
## -- Column specification --------------------------------------------------------
## cols(
## objectid = col_double(),
## province_territory = col_character(),
## project_name = col_character(),
## total_project_capacity_mw = col_double(),
## turbine_identifier = col_character(),
## turbine_number_in_project = col_character(),
## turbine_rated_capacity_k_w = col_double(),
## rotor_diameter_m = col_double(),
## hub_height_m = col_double(),
## manufacturer = col_character(),
## model = col_character(),
## commissioning_date = col_character(),
## latitude = col_double(),
## longitude = col_double(),
## notes = col_character()
## )
#abak = a
a = readRDS('a.rds') %>%
clean_names() %>%
mutate(across(where(is.character),factor)) %>%
select(sort(tidyselect::peek_vars())) %>%
select(
where(is.Date),
where(is.factor),
where(is.numeric)
) %>% slice_sample(prop = 0.333) #for speed, ony using 1/3 of orig dataset
a %>% sample_n(5)#remove unnecessary cols
a = a %>% select(
-notes,
-turbine_identifier,
-turbine_number_in_project,
-objectid,
-total_project_capacity_mw,
#removing the vars below b/c it would be unrealistic to use them for predictions
#we likely wouldn't have such information with new data
-model,
-latitude,
-longitude,
-project_name
)
a %>% sample_n(5)#normally I would do more thorough EDA, but the purpose of this notebook is to practice creating and interpreting XGBoost Models
library(inspectdf)## Warning: package 'inspectdf' was built under R version 4.0.3
## $commissioning_date
## # A tibble: 31 x 3
## value prop cnt
## <chr> <dbl> <int>
## 1 2014 0.119 257
## 2 2013 0.104 224
## 3 2011 0.0868 187
## 4 2015 0.0789 170
## 5 2009 0.0771 166
## 6 2006 0.0720 155
## 7 2012 0.0548 118
## 8 2010 0.0460 99
## 9 2016 0.0404 87
## 10 2008 0.0306 66
## # ... with 21 more rows
##
## $manufacturer
## # A tibble: 21 x 3
## value prop cnt
## <chr> <dbl> <int>
## 1 Vestas 0.287 619
## 2 GE 0.273 588
## 3 Enercon 0.149 322
## 4 Siemens 0.143 307
## 5 Senvion 0.105 227
## 6 NEG Micon 0.0209 45
## 7 Acciona 0.00418 9
## 8 Acciona Wind Power 0.00418 9
## 9 Nordex 0.00418 9
## 10 Samsung Renewable Energy 0.00186 4
## # ... with 11 more rows
##
## $province_territory
## # A tibble: 11 x 3
## value prop cnt
## <chr> <dbl> <int>
## 1 Ontario 0.371 800
## 2 Quebec 0.317 682
## 3 Alberta 0.141 304
## 4 British Columbia 0.0478 103
## 5 Nova Scotia 0.0404 87
## 6 Saskatchewan 0.0232 50
## 7 New Brunswick 0.0200 43
## 8 Prince Edward Island 0.0186 40
## 9 Manitoba 0.0153 33
## 10 Newfoundland and Labrador 0.00511 11
## 11 Northwest Territories 0.000464 1
## Warning: package 'WVPlots' was built under R version 4.0.3
#https://cran.r-project.org/web/packages/WVPlots/vignettes/WVPlots_examples.html
a %>% ScatterHist('rotor_diameter_m','turbine_rated_capacity_k_w', smoothmethod = 'lm','rotor diameter vs turbine output')## Warning: Removed 3 rows containing missing values (geom_smooth).
a %>% ScatterHist('hub_height_m','turbine_rated_capacity_k_w', smoothmethod = 'lm','hub height vs turbine output')## Warning: package 'ggpointdensity' was built under R version 4.0.3
rec = train %>% recipe(turbine_rated_capacity_k_w ~ . ) %>%
step_zv(all_numeric(), - all_outcomes()) %>%
step_nzv(all_numeric(), - all_outcomes()) %>%
step_corr(all_numeric(), - all_outcomes()) %>%
#----------------------------
step_other(manufacturer, - all_outcomes()) %>%
step_normalize(all_numeric(), - all_outcomes()) %>%
step_dummy(all_nominal(), - all_outcomes(), one_hot = TRUE)
baked.train = rec %>% prep %>% bake(new_data = NULL)
baked.test = rec %>% prep %>% bake(new_data = test)
#---------------------------------------------------
mdl = parsnip::boost_tree(
trees = tune(),
min_n = tune(),
tree_depth = tune(),
learn_rate = tune(),
loss_reduction = tune(),
sample_size = tune()
) %>%
set_mode("regression") %>%
set_engine("xgboost")
#---------------------------------------------------
wf = workflow() %>%
add_recipe(rec) %>%
add_model(mdl)set.seed(345)
doParallel::registerDoParallel()
tg = tune_grid(
object = wf,
resamples = vfold,
grid = 10
)
#---------------------------------------------------
wf = wf %>% finalize_workflow(tg %>% select_best())## Warning: No value of `metric` was given; metric 'rmse' will be used.
## parsnip model object
##
## Fit time: 5.1s
## ##### xgb.Booster
## raw: 1 Mb
## call:
## xgboost::xgb.train(params = list(eta = 0.0132231849539539, max_depth = 12L,
## gamma = 0.0000000985063220750636, colsample_bytree = 1, min_child_weight = 19L,
## subsample = 0.669344357615337), data = x$data, nrounds = 598L,
## watchlist = x$watchlist, verbose = 0, objective = "reg:squarederror",
## nthread = 1)
## params (as set within xgb.train):
## eta = "0.0132231849539539", max_depth = "12", gamma = "0.0000000985063220750636", colsample_bytree = "1", min_child_weight = "19", subsample = "0.669344357615337", objective = "reg:squarederror", nthread = "1", validate_parameters = "TRUE"
## xgb.attributes:
## niter
## callbacks:
## cb.evaluation.log()
## # of features: 55
## niter: 598
## nfeatures : 55
## evaluation_log:
## iter training_rmse
## 1 2034
## 2 2008
## ---
## 597 64
## 598 64
kekka %>% collect_predictions() %>%
select(.pred, turbine_rated_capacity_k_w) %>%
metric_set(rmse, mae, mape, rsq)(estimate = .pred, truth = turbine_rated_capacity_k_w)ggplotly(
kekka %>% collect_predictions() %>%
select(.pred, turbine_rated_capacity_k_w) %>%
ggplot(aes(.pred, turbine_rated_capacity_k_w)) +
geom_abline(slope = 1, linetype = 'dotted', color = 'tomato3') +
geom_point(alpha = 0.7, size = 1.5, color = 'slateblue3') +
labs(
title = 'Turbine Capacity (KW) Predictions vs Actuals', x = 'Predicted', y = 'Actuals'
)
)#SHAPforxgboost
X_train = baked.train %>% select(-turbine_rated_capacity_k_w) %>% as.matrix
y = baked.train$turbine_rated_capacity_k_w
(best.hps = tg %>% select_best())## Warning: No value of `metric` was given; metric 'rmse' will be used.
## Warning: package 'xgboost' was built under R version 4.0.3
mdl.xgb = xgboost(
data = X_train,
label = y,
max.depth = best.hps$tree_depth,
nrounds = best.hps$trees,
eta = best.hps$learn_rate,
min.child.weight = best.hps$min_n,
gamma = best.hps$loss_reduction,
subsample = best.hps$sample_size,
#----------------------------
nthread = parallel::detectCores() - 2,
verbose = FALSE,
objective = 'reg:squarederror'
)shap.vals = shap.values(
mdl.xgb, X_train
)
#----------------------------individual observation's SHAP values
observation = 88
shap.vals$shap_score %>% dplyr::slice(observation)shap.vals$shap_score %>% dplyr::slice(observation) %>%
pivot_longer(everything()) %>%
rename('feature' = 'name', 'SHAP' = 'value') %>%
filter(SHAP > 1 | SHAP < -1) %>%
mutate(
feature = fct_reorder(feature, SHAP),
SHAP = round(SHAP, 1)
) %>%
plot_ly(x = ~SHAP, y = ~feature, color = ~if_else(SHAP < 0, I('darkred'), I('darkgreen'))) %>%
layout(
title = paste0('Local: Significant SHAP Values for Observation ', observation),
xaxis = list(title = paste0(
'Cumulative Total SHAP Values: ',
round(shap.vals$shap_score %>% dplyr::slice(observation) %>% rowSums(), 2)
)),
yaxis = list(title = ''))## No trace type specified:
## Based on info supplied, a 'bar' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#bar
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
## Warning in if (is.na(col)) {: the condition has length > 1 and only the first
## element will be used
## Warning in if (is.na(col)) {: the condition has length > 1 and only the first
## element will be used
## Warning: `error_y.color` does not currently support multiple values.
## Warning: `error_x.color` does not currently support multiple values.
#----------------------------top 10 Features avg SHAP values
n = 10
tibble(
features = shap.vals$mean_shap_score %>% sort(decreasing = TRUE) %>% head(n) %>% names,
SHAP = shap.vals$mean_shap_score %>% sort(decreasing = TRUE) %>% head(n)
) %>% mutate(features = fct_reorder(features, SHAP)) %>%
plot_ly(x = ~SHAP, y = ~features) %>%
layout(
title = paste0('Global: Average SHAP for Top ', n ,' Features'),
xaxis = list(title = ''),
yaxis = list(title = '')
)## No trace type specified:
## Based on info supplied, a 'bar' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#bar
ggplotly(
baked.train %>%
mutate(manufacturer_GE = if_else(manufacturer_GE == 1, 'GE','Other')) %>%
ggplot(aes(manufacturer_GE, turbine_rated_capacity_k_w, group = manufacturer_GE)) +
geom_boxplot(aes(fill = manufacturer_GE), alpha = 0.50) +
labs(
title = 'Distribution: GE turbine capacity (KW) vs Other Manufacturers',
y = '', x = ''
) + scale_fill_manual(values = c('dodgerblue3','grey60')) +
theme(legend.position = 'none')
)## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#----------------------------
ggplotly(
baked.train %>%
mutate(manufacturer_Enercon = if_else(manufacturer_Enercon == 1, 'Enercon','Other')) %>%
ggplot(aes(manufacturer_Enercon, turbine_rated_capacity_k_w, group = manufacturer_Enercon)) +
geom_boxplot(aes(fill = manufacturer_Enercon), alpha = 0.50) +
labs(
title = 'Distribution: Enercon turbine capacity (KW) vs Other Manufacturers',
y = '', x = ''
) + scale_fill_manual(values = c('seagreen4','grey60')) +
theme(legend.position = 'none')
)n = 5
top.n.features = shap.vals$mean_shap_score %>% sort(decreasing = TRUE) %>% head(n) %>% names
X_train.long = shap.prep(xgb_model = mdl.xgb, X_train = X_train)
map(
.x = top.n.features,
~shap.plot.dependence(
data_long = X_train.long,
x = .x,
size0 = 1.5,
add_hist = TRUE
)
)## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 8.1421e-029
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 8.1421e-029
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : radius 2.5e-005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : zero-width neighborhood. make span bigger
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 5)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.4265e-029
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : reciprocal condition number 5.4265e-029
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
## parametric, : There are other near singularities as well. 1.01
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
#The SHAP interaction values take time since it calculates all the combinations.
shap.plot.dependence(
data_long = X_train.long,
data_int = shap.prep.interaction(
xgb_model = mdl.xgb,
X_train = X_train
),
x = 'rotor_diameter_m',
y = 'hub_height_m'
)## `geom_smooth()` using formula 'y ~ x'
## The SHAP values of the Rest 52 features were summed into variable 'rest_variables'.
## Data has N = 1617 | zoom in length is 150 at location 970.2.